Importing Libraries¶

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

Loading Dataset¶

In [2]:
df=pd.read_csv(r'D:\Languages\Netflix_EDA\netflix_title.csv')
df.head(5)
Out[2]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN September 24, 2021 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN September 24, 2021 2021 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...

Basic information of the dataset¶

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB

Dropping unnecessary columns¶

In [4]:
df.drop(['show_id','description'],axis=1,inplace=True)
In [5]:
df.head(5)
Out[5]:
type title director cast country date_added release_year rating duration listed_in
0 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min Documentaries
1 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries
2 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN September 24, 2021 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act...
3 TV Show Jailbirds New Orleans NaN NaN NaN September 24, 2021 2021 TV-MA 1 Season Docuseries, Reality TV
4 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ...

Checking duplicates¶

In [6]:
df.duplicated().sum()
Out[6]:
0

Checking missing values¶

In [7]:
df.isnull().sum()
Out[7]:
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
dtype: int64
In [8]:
df.isnull().sum()/len(df)*100
Out[8]:
type             0.000000
title            0.000000
director        29.908028
cast             9.367549
country          9.435676
date_added       0.113546
release_year     0.000000
rating           0.045418
duration         0.034064
listed_in        0.000000
dtype: float64

Data Cleaning¶

In [9]:
df.fillna({'cast':'NA'}, inplace=True)
df.fillna({'director':'Na'},inplace=True)
df.fillna({'country':'-'},inplace=True)
df.fillna({'rating':'-'},inplace=True)
df.fillna({'duration':'-'},inplace=True)
df.fillna({'date_added':'-'},inplace=True)
In [10]:
df.isnull().sum()
Out[10]:
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
dtype: int64

Data Transformation¶

In [11]:
df.rename(columns={"listed_in":"categories"},inplace=True)
In [12]:
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month_name()
In [13]:
df.head(5)
Out[13]:
type title director cast country date_added release_year rating duration categories year_added month_added
0 Movie Dick Johnson Is Dead Kirsten Johnson NA United States 2021-09-25 2020 PG-13 90 min Documentaries 2021.0 September
1 TV Show Blood & Water Na Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa 2021-09-24 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries 2021.0 September
2 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... - 2021-09-24 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... 2021.0 September
3 TV Show Jailbirds New Orleans Na NA - 2021-09-24 2021 TV-MA 1 Season Docuseries, Reality TV 2021.0 September
4 TV Show Kota Factory Na Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India 2021-09-24 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... 2021.0 September

Data Visualization¶

In [14]:
df['type'].unique()
Out[14]:
array(['Movie', 'TV Show'], dtype=object)
In [15]:
type_counts=df['type'].value_counts()
type_index=type_counts.index
plt.figure(figsize=(8, 6))
plt.pie(type_counts, labels=type_index, autopct=lambda p: '{:.1f}% ({:.0f})'.format(p, p * sum(type_counts) / 100),startangle=270, colors=['#000000','#ba0c0c'],explode = (0.06, 0),pctdistance=1.28)
plt.title('Distribution of Show Types')
plt.show()
No description has been provided for this image

Insights

  • Chart illustrates the distribution of show types.
  • Netflix produces more movie than TV show
In [16]:
df.rating.unique()
Out[16]:
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', '-',
       'TV-Y7-FV', 'UR'], dtype=object)
In [17]:
ratings_counts = df['rating'].value_counts()

# Filter the original DataFrame (df) based on ratings that occur more than 50 times
netflix_filtered = df[df['rating'].isin(ratings_counts[ratings_counts > 50].index)]

custom_palette={'Movie':'#000000','TV Show':'#ba0c0c'}
plt.figure(figsize=(14, 6))
chart = sns.countplot(data = netflix_filtered, x = 'rating', hue = 'type', palette = custom_palette)
plt.title('Content type as per ratings', fontsize = 15)
plt.xlabel('Rating', fontsize = 14)
plt.ylabel('No. of Rating',fontsize = 12)

legend_labels = netflix_filtered['type'].unique()
for label in legend_labels:
    total_count = netflix_filtered[netflix_filtered['type'] == label].shape[0]
    label_with_count = f"{label} ({total_count})"
    legend_labels[legend_labels == label] = label_with_count

chart.legend(labels=legend_labels, loc = 'center left', bbox_to_anchor=(0.850,0.80),
          ncol = 1, fancybox = True, shadow = True)

for bars in chart.containers:
    chart.bar_label(bars)
plt.show()
No description has been provided for this image

Insights

  • Netflix produces TV-MA, TV-14 and TV-PG shows more than other ratings shows.
In [18]:
# Extract the count of each unique director and select the top 10 directors

data = df['director'].value_counts().iloc[1:11].to_dict()
netflix2 = pd.DataFrame(list(data.items()), columns=['Director', 'Count'])

red_black_palette = ['#980a0a','#ba0c0c', '#635454','#dc4848','#383434','#564d4d']

fig = px.pie(netflix2, values='Count', names='Director', title='Top Directors and Their Contribution',
             hole=0.3, labels={'Count': 'Director Count'},color_discrete_sequence=red_black_palette)

fig.update_layout(height=600, width=800)
fig.update_layout(legend=dict(x=1.4, y=0.5, traceorder='normal', orientation='v'))
fig.update_layout(title_x=0.5)
fig.show()
In [19]:
df['release_year'].value_counts().sort_index().plot(kind='line', marker='o', markerfacecolor='#ba0c0c', color='#000000', figsize=(12, 6))

plt.title('Content Releases Over Time',fontsize=15)
plt.xlabel('Release Year',fontsize=14)
plt.show()
No description has been provided for this image
In [20]:
# Extract the top 10 genres from the 'categories' column and count their occurrences

top_genres = df['categories'].str.split(', ', expand=True).stack().value_counts().head(10)
df_treemap = pd.DataFrame({'Genre': top_genres.index, 'Number of Releases': top_genres.values})
df_treemap['percent'] = df_treemap['Number of Releases'] / df_treemap['Number of Releases'].sum()

netflix_color_palette = ['#980a0a','#ba0c0c', '#df0707','#d94747','#f48b8b','#a6a0a0','#8c8c8c','#5b5757','#3e3d3d','#000000']  # Netflix Red and White

# Create a custom treemap using plotly.graph_objects
fig = go.Figure(go.Treemap(
    labels=df_treemap['Genre'],
    parents=[""] * len(df_treemap['Genre']),
    values=df_treemap['Number of Releases'],
    textinfo="label+percent entry",
    marker=dict(colors=netflix_color_palette ,
    line=dict(width=2, color="black"))
    ))

fig.update_layout(title='Top 10 Netflix Genres',title_x=0.5, margin=dict(l=20, r=20, t=60, b=20), width=800, height=600)
fig.show()
In [21]:
df[df['country'] != '-']['country'].value_counts().head(4)
Out[21]:
country
United States     2818
India              972
United Kingdom     419
Japan              245
Name: count, dtype: int64
In [22]:
df_top4 = df[(df["country"]=="United States") | (df["country"]=="India") | (df["country"]=="United Kingdom") | 
             (df["country"]=="Japan")]
In [23]:
df_top10=df['rating'].value_counts().head(10)
In [24]:
top_ratings = df_top4['rating'].value_counts().head(10).index
df_top10_ratings = df_top4[df_top4['rating'].isin(top_ratings)]

cp={'Movie':'#000000','TV Show':'#ba0c0c'}

plt.figure(figsize=(18, 8))
sns.set_theme(style="white")

sns.stripplot(x="country", y="rating", hue="type", data=df_top10_ratings,
              order=["United States", "India", "United Kingdom", "Japan"],
              palette=cp, size=12)

plt.xlabel('Country', fontsize=19)
plt.ylabel('Rating', fontsize=19)
plt.legend(loc="best", title="Type", fontsize=12)
plt.title("Distribution of top 10 rating-based movies and shows in top 4 countries", fontsize=19)

plt.show()
No description has been provided for this image

Insights

  • In United States, Netflix prdouces all kind of shows evenly in all types of rating.
  • In India, Netflix produces very few shows of PG,PG-13,NR,R rated shows.
  • In United Kingdom, Netflix produces very few shows of PG,TV-Y7,NR rated shows.
  • In Japan, Netflix produces very few shows of TV-G, TV-Y, and PG-13 ratings, and does not produce R and NR rated shows.
In [25]:
# Filter the DataFrame to include only rows where the 'release_year' is greater than or equal to the current year minus 13

df_filtered = df[df['release_year'] >= (pd.Timestamp.now().year - 13)]
heatmap_data = df_filtered.groupby(['release_year', 'month_added']).size().unstack().fillna(0)
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
heatmap_data = heatmap_data[month_order]

plt.figure(figsize=(14, 6))
sns.heatmap(heatmap_data.T, annot=False, cmap="RdGy", fmt='g')  

plt.title('Netflix Content Releases Over Month')
plt.xlabel('Year',fontsize=15)
plt.ylabel('Month',fontsize=14)  

plt.show()
No description has been provided for this image
In [26]:
content_type_distribution = df.groupby(['year_added', 'type']).size().unstack().fillna(0)
netflix_palette = {'Movie':'#000000','TV Show':'#ba0c0c'}
content_type_distribution.plot.area(stacked=True, color=[netflix_palette.get(type, '#CCCCCC') for type in content_type_distribution.columns], figsize=(12, 8))

plt.title('Distribution of Content Types Over Time')
plt.xlabel('Year')
plt.legend(title='Content Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
No description has been provided for this image
In [27]:
# Filter the DataFrame to include only rows where the 'country' column is not equal to '-',considering only the top 5 countries
country_name = df[df['country']!='-']['country'].value_counts().head(5)
country_name
Out[27]:
country
United States     2818
India              972
United Kingdom     419
Japan              245
South Korea        199
Name: count, dtype: int64
In [28]:
red_black_palette = {
    'United States': '#bd1e1e',
    'India': '#df0c0c',
    'United Kingdom': '#fc0a0a',
    'Japan': '#e15656',
    'South Korea': '#f29595'
}

fig = px.scatter(
    country_name, x=country_name.values, y=country_name.index,
    color=country_name.index,
    color_discrete_map=red_black_palette,
    size=country_name.values,  # Use the 'Values' column to determine marker sizes
    size_max=100,
    labels={'Values': country_name.values, 'Country': country_name.index},
)

fig.update_layout(
    title='Top 5 Countries with the Most Netflix Content',
    title_x=0.5,
    height=600,
    xaxis_title='Values',
    yaxis_title='Country',
    xaxis_title_font=dict(
        family='Arial',
        size=24,
    ),
     yaxis_title_font=dict(
        family='Arial',
        size=24,
    ),
    plot_bgcolor='Black',
)

fig.show()

WordCloud of Cast¶

In [29]:
txt = " ".join(cast for cast in df.cast)
word_cloud = WordCloud(collocations = False, background_color = 'black',
                        width = 2048, height = 1080,colormap='RdGy').generate(txt)
plt.figure(figsize=(30,10))
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
No description has been provided for this image

WordCloud of Title¶

In [30]:
txt1 = " ".join(title for title in df.title)
word_cloud1 = WordCloud(collocations = False, background_color = 'black',
                        width = 2048, height = 1080,colormap='RdGy').generate(txt1)
plt.figure(figsize=(30,10))
plt.imshow(word_cloud1, interpolation='bilinear')
plt.axis("off")
plt.show()
No description has been provided for this image
In [ ]: